{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn import preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = np.array([[3, -1.5,  2, -5.4], [0,  4,  -0.3, 2.1], [1,  3.3, -1.9, -4.3]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3. , -1.5,  2. , -5.4],\n",
       "       [ 0. ,  4. , -0.3,  2.1],\n",
       "       [ 1. ,  3.3, -1.9, -4.3]])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 4)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean:  [ 1.33333333  1.93333333 -0.06666667 -2.53333333]\n",
      "Standard Deviation:  [1.24721913 2.44449495 1.60069429 3.30689515]\n"
     ]
    }
   ],
   "source": [
    "print(\"Mean: \",data.mean(axis=0))\n",
    "print(\"Standard Deviation: \",data.std(axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.247219128924647"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "math.sqrt((math.pow((3-1.33333333  ),2)+math.pow((0-1.33333333  ),2)+math.pow((1-1.33333333  ),2))/3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.3363062110825705"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(3-1.33333333)/1.24721913"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_standardized = preprocessing.scale(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean standardized data:  [ 5.55111512e-17 -1.11022302e-16 -7.40148683e-17 -7.40148683e-17]\n",
      "Standard Deviation standardized data:  [1. 1. 1. 1.]\n"
     ]
    }
   ],
   "source": [
    "print(\"Mean standardized data: \",data_standardized.mean(axis=0))\n",
    "print(\"Standard Deviation standardized data: \",data_standardized.std(axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1.33630621, -1.40451644,  1.29110641, -0.86687558],\n",
       "       [-1.06904497,  0.84543708, -0.14577008,  1.40111286],\n",
       "       [-0.26726124,  0.55907936, -1.14533633, -0.53423728]])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_standardized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3. , -1.5,  2. , -5.4],\n",
       "       [ 0. ,  4. , -0.3,  2.1],\n",
       "       [ 1. ,  3.3, -1.9, -4.3]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))\n",
    "data_scaled = data_scaler.fit_transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.        , 0.        , 1.        , 0.        ],\n",
       "       [0.        , 1.        , 0.41025641, 1.        ],\n",
       "       [0.33333333, 0.87272727, 0.        , 0.14666667]])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_scaled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Min:  [ 0.  -1.5 -1.9 -5.4]\n",
      "Max:  [3.  4.  2.  2.1]\n",
      "Min:  [0. 0. 0. 0.]\n",
      "Max:  [1. 1. 1. 1.]\n"
     ]
    }
   ],
   "source": [
    "print(\"Min: \",data.min(axis=0))\n",
    "print(\"Max: \",data.max(axis=0))\n",
    "\n",
    "print(\"Min: \",data_scaled.min(axis=0))\n",
    "print(\"Max: \",data_scaled.max(axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3. , -1.5,  2. , -5.4],\n",
       "       [ 0. ,  4. , -0.3,  2.1],\n",
       "       [ 1. ,  3.3, -1.9, -4.3]])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_normalized = preprocessing.normalize(data, norm='l1',axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.75      , -0.17045455,  0.47619048, -0.45762712],\n",
       "       [ 0.        ,  0.45454545, -0.07142857,  0.1779661 ],\n",
       "       [ 0.25      ,  0.375     , -0.45238095, -0.36440678]])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_normalized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1. 1. 1. 1.]\n"
     ]
    }
   ],
   "source": [
    "data_norm_abs = np.abs(data_normalized)\n",
    "\n",
    "print(data_norm_abs.sum(axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_normalized = preprocessing.normalize(data, norm='l2',axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.9486833 , -0.27787309,  0.72074997, -0.74841361],\n",
       "       [ 0.        ,  0.7409949 , -0.1081125 ,  0.29104974],\n",
       "       [ 0.31622777,  0.61132079, -0.68471247, -0.59595899]])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_normalized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9486833100000001"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "0.31622777*3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1., 1., 1., 1.])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(data_normalized*data_normalized).sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.000000006218063"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "0.9486833*0.9486833+0+0.31622777*0.31622777"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3. , -1.5,  2. , -5.4],\n",
       "       [ 0. ,  4. , -0.3,  2.1],\n",
       "       [ 1. ,  3.3, -1.9, -4.3]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_binarized = preprocessing.Binarizer(threshold=3.4).transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0., 0., 0.],\n",
       "       [0., 1., 0., 0.],\n",
       "       [0., 0., 0., 0.]])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_binarized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1 1 2]\n",
      " [0 2 3]\n",
      " [1 0 1]\n",
      " [0 1 0]]\n"
     ]
    }
   ],
   "source": [
    "data = np.array([[1, 1, 2], [0, 2, 3], [1, 0, 1], [0, 1, 0]])\n",
    "print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:415: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
      "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
      "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
      "  warnings.warn(msg, FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "OneHotEncoder(categorical_features=None, categories=None, drop=None,\n",
       "              dtype=<class 'numpy.float64'>, handle_unknown='error',\n",
       "              n_values=None, sparse=True)"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder = preprocessing.OneHotEncoder()\n",
    "encoder.fit(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 1. 1. 0. 0. 0. 0. 0. 1.]]\n"
     ]
    }
   ],
   "source": [
    "encoded_vector = encoder.transform([[1, 0, 3]]).toarray()\n",
    "print(encoded_vector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<1x9 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 3 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder.transform([[1, 0, 3]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 1., 1., 0., 0., 0., 0., 0., 1.]])"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoded_vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_encoder = preprocessing.LabelEncoder()\n",
    "\n",
    "input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "input_classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Class mapping: \n",
      "audi --> 0\n",
      "bmw --> 1\n",
      "ford --> 2\n",
      "toyota --> 3\n"
     ]
    }
   ],
   "source": [
    "label_encoder.fit(input_classes)\n",
    "print(\"Class mapping: \")\n",
    "for i, item in enumerate(label_encoder.classes_):\n",
    "    print(item, \"-->\", i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Labels = ['toyota', 'ford', 'audi']\n",
      "Encoded labels = [3, 2, 0]\n"
     ]
    }
   ],
   "source": [
    "labels = ['toyota', 'ford', 'audi']\n",
    "encoded_labels = label_encoder.transform(labels)\n",
    "print(\"Labels =\", labels)\n",
    "print(\"Encoded labels =\", list(encoded_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Encoded labels = [2, 1, 0, 3, 1]\n",
      "Decoded labels = ['ford', 'bmw', 'audi', 'toyota', 'bmw']\n"
     ]
    }
   ],
   "source": [
    "encoded_labels = [2, 1, 0, 3, 1]\n",
    "decoded_labels = label_encoder.inverse_transform(encoded_labels)\n",
    "print(\"Encoded labels =\", encoded_labels)\n",
    "print(\"Decoded labels =\", list(decoded_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Importing Dataset\n",
    "dataset = pd.read_csv(r'data_set\\purchased.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Country</th>\n",
       "      <th>Age</th>\n",
       "      <th>Salary</th>\n",
       "      <th>Purchased</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>France</td>\n",
       "      <td>44.0</td>\n",
       "      <td>72000.0</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Spain</td>\n",
       "      <td>27.0</td>\n",
       "      <td>48000.0</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>Germany</td>\n",
       "      <td>30.0</td>\n",
       "      <td>54000.0</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>Spain</td>\n",
       "      <td>38.0</td>\n",
       "      <td>61000.0</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>Germany</td>\n",
       "      <td>40.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Country   Age   Salary Purchased\n",
       "0   France  44.0  72000.0        No\n",
       "1    Spain  27.0  48000.0       Yes\n",
       "2  Germany  30.0  54000.0        No\n",
       "3    Spain  38.0  61000.0        No\n",
       "4  Germany  40.0      NaN       Yes"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = dataset.iloc[:, :-1].values\n",
    "Y = dataset.iloc[:, 3].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([['France', 44.0, 72000.0],\n",
       "       ['Spain', 27.0, 48000.0],\n",
       "       ['Germany', 30.0, 54000.0],\n",
       "       ['Spain', 38.0, 61000.0],\n",
       "       ['Germany', 40.0, nan],\n",
       "       ['France', 35.0, 58000.0],\n",
       "       ['Spain', nan, 52000.0],\n",
       "       ['France', 48.0, 79000.0],\n",
       "       ['Germany', 50.0, 83000.0],\n",
       "       ['France', 37.0, 67000.0]], dtype=object)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.impute import SimpleImputer \n",
    "imputer=SimpleImputer(missing_values=np.nan,strategy='mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "imputer=imputer.fit(X[:,1:3])\n",
    "X[:,1:3]=imputer.transform(X[:,1:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([['France', 44.0, 72000.0],\n",
       "       ['Spain', 27.0, 48000.0],\n",
       "       ['Germany', 30.0, 54000.0],\n",
       "       ['Spain', 38.0, 61000.0],\n",
       "       ['Germany', 40.0, 63777.77777777778],\n",
       "       ['France', 35.0, 58000.0],\n",
       "       ['Spain', 38.77777777777778, 52000.0],\n",
       "       ['France', 48.0, 79000.0],\n",
       "       ['Germany', 50.0, 83000.0],\n",
       "       ['France', 37.0, 67000.0]], dtype=object)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
